#!/usr/bin/perl
#
#
# Copyright 2014, NICTA
#
# This software may be distributed and modified according to the terms of
# the BSD 2-Clause license. Note that NO WARRANTY is provided.
# See "LICENSE_BSD2.txt" for details.
#
# @TAG(NICTA_BSD)
#
#
# TODO: fight code duplication
# TODO: improve interface of subs
# TODO: global var with current file name (ofn) for error messages?
# TODO: use File::Basename::fileparse() instead of splitname?
# TODO: revisit defaults for make format
# TODO: report problems occurring multiple times only once?
# TODO: avoid special accounting for dot as well as make

use strict 'vars';
use less 'money';
use Getopt::Long;
use Pod::Usage;
use File::Spec::Functions;
use Cwd;

sub shell_quote { my ($_) = @_; s/([ ><();*?\\|#'"])/\\$1/g; return $_; }
my $cmdline = join(' ', map { shell_quote($_) } ($0, @ARGV));

my %o = (verbose => 1, basedir => '.', uses => '1', recursive => '1',
         'isa-home' => $ENV{ISABELLE_HOME}, format => 'make',
         'graph-attrs' => 'fontsize=18 style=filled color="#eecc99" bgcolor=white',
         'edge-attrs' => 'color=black arrowsize=0.6',
         'cluster-attrs' => '');

Getopt::Long::Configure("gnu_getopt");
GetOptions(\%o, qw{help|h verbose|v+}, 'quiet|q' => sub {$o{verbose}=0},
           qw{basedir|D=s uses! recursive!},
           'r' => sub {$o{recursive}=1}, 'R' => sub {$o{recursive}=0},
           qw{isa-home|H=s thypath|P=s ignore|i=s@ prune|p=s@},
           qw{format|T=s graph-attrs|G=s edge-attrs|E=s cluster-attrs|C=s},
           qw{target|t=s output|o=s})
  || pod2usage(2);
pod2usage(1) if $o{help};

@ARGV || pod2usage("$0: no input files given");
my @thyfiles = @ARGV;

sub diagnose {
  my $level = shift;
  warn @_ unless $level gt $o{verbose};
}

# converts the path $relpath given relative to $basedir into an absolute path
# It is similar to File::Spec::rel2abs but collapses "x/../y" into "y".
sub rel2abs {
  my $relpath = shift;
  my $basedir = shift;
  my $abspath = File::Spec::Functions::rel2abs($relpath, $basedir);

  if ($relpath =~ s/^(~~|\$ISABELLE_HOME)//) {
    if ($o{'isa-home'}) {
      $abspath =
        File::Spec::Functions::rel2abs($o{'isa-home'}.$relpath);
    }
    else {
      diagnose (2, qq{DEBUG: encountered "$1" but ISABELLE_HOME not set.\n});
      return $1.$relpath;
    }
  }
  return Cwd::abs_path($abspath) || $abspath; # avoid returning the empty string
  # abs_path returns the empty string if the path could not be resolved
  # because of a missing directory (the file itself does not need to exist)
}

# converts the path $abspath into a path relative to $basedir.
# returns $abspath if $basedir is '//'
sub abs2rel {
  my $abspath = shift;
  my $basedir = shift;
  return ($basedir eq '//'? $abspath :
          File::Spec::Functions::abs2rel($abspath, $basedir));
}

$o{basedir} = rel2abs($o{basedir}) unless $o{basedir} eq '//';
@{$o{ignore}} = ('Main') unless defined $o{ignore};
my @thypath = map { rel2abs($_) } split(/:/,$o{thypath});
unless (defined $o{output}) {
  if ($o{format} eq 'make') { $o{output} = 'thydeps.d'}
  elsif ($o{format} eq 'text') { $o{output} = 'thydeps.txt'}
  else { $o{output} = "thydeps.$o{format}"}
  $o{output} = abs2rel(rel2abs($o{output}, $o{basedir}));
}

diagnose(2, "DEBUG: ".join(' ', map "$_: '$o{$_}'", keys %o)."\n");

my %deps;
my %fname;
my %leaves; # dependencies that are not among the keys. HASH avoids duplicates
my %missing; # file names that could not be found. HASH avoids duplicates

# BUG: does not handle filenames without a dot (but that's unlikely to happen).
sub splitname { my $_ = shift; m{^(.*/)?([^/]*)\.([^.]*)$}; return ($1,$2,$3); }

# tries to find the theory with file name $relpath given relative to $basedir
# and returns the absolute file path
# It is similar to sub rel2abs but uses $o{thypath} to locate theory files.
# Certainly, files must exist.
sub findthy {
  my $relpath = shift;
  my $basedir = shift;
  my ($dir,$tn,$ext) = splitname($relpath);
  my @dirs = (rel2abs($dir,$basedir), @thypath);

  while ($#dirs>=0) {
    my $abspath = (shift @dirs)."/$tn.$ext";
    return $abspath if -r $abspath;
  }
  return;
}

### INPUT

# process a theory dependency
# TODO: too many arguments?
sub thy_dep {
  my $dtn = shift; # dependent theory name (without thy extension)
  my $ofn = shift; # original filename as specified by the user (used in error messages)
  my $afn = shift; # absolute filename (hopefully unique) internal identifier

  $dtn =~ m{^(?:.*/)?([^/]*)$}; 
  foreach my $_ (@{$o{ignore}}) { return if $_ eq $1; };

  my $dir = (splitname($afn))[0];
  my $dfn = findthy("$dtn.thy", $dir);
  if (defined $dfn) {
    push @{$deps{$afn}}, abs2rel($dfn, $o{basedir});
    if ($o{recursive}) { push @thyfiles, abs2rel($dfn); }
    else { $leaves{$dfn} = 1; }
  }
  else {
    diagnose(1, qq{$ofn: dependent theory "$dtn" not found.\n});
    $afn = rel2abs("$dtn.thy", $dir);
    $missing{$afn} = 1;
  }
}

# FIXME: tags and crazy names (e.g. theory 007 or theory \<AA>) not supported
sub read_thy {
  my $re_word    = '\b[a-zA-Z][a-zA-Z0-9_]*\b';  # a word starts with a letter
  my $re_id      = '(?:'.$re_word.'|"[^"]*")';   # bare word or "quoted string"
  my $re_braces  = '\{\*(?:[^*]|\*(?!\}))*\*\}'; # {* curly braced string *}
  my $re_comment = '\(\*(?:[^*]|\*(?!\)))*\*\)'; # (* parenthesized comment *)
  my $re_string  = '"[^"]*"';                    # "simple quoted string"

  my $ofn = shift; # original filename as specified by the user (used in error messages)
  my $afn = shift; # absolute filename (hopefully unique) internal identifier

  my ($dir, $tn, $ty) = splitname($afn);
  foreach my $_ (@{$o{prune}}) { $leaves{$afn} = 1 and return if $_ eq $tn; };
  
  open(FILE, "<", "$afn") || die "$afn: $!";
  my $content = '';
  while (not eof(FILE) and
         (not $content =~ m/\bbegin\b/ or $content =~ m/--|\bheader\b|\(\*/)) {
    $content .= <FILE>;
    $content =~ s/\n$/ /;
    $content =~ s/(?:$re_comment|(?:--|\bheader)\s*(?:$re_braces|$re_id))//gc;
  }
  my $body = '';
  while (not eof(FILE)) {
      $body .= <FILE>;
  }
  $body =~ s/\n/ /;
  close(FILE) || die "$!";

  $_ = $content;
  if (s/^\s*theory\s*($re_id)\s*//) {
    diagnose(1, "$ofn: theory name $1 does not match file name, assuming $tn.\n")
      unless $1 eq $tn;
    @{$deps{$afn}} = ();
    if (s/^imports\s*//) {
      while ((not m/^(?:keywords|uses|begin)\b/) and s/^($re_id)\s*//) {
        $1 =~ m/^"?([^"]*)"?$/; thy_dep($1, $ofn, $afn);
      }
    }

    if (s/^keywords\s*//) {
      while (not m/^(?:uses|begin)\b/ and
             s/^(?:"[^"]*"|and\b\s*|::\s*$re_id)\s*//) {}
    }

    if (s/^uses\s*//) {
      while ((not m/^(?:begin)\b/) and s/^\(?($re_id)\)?\s*//) {
        if ($o{uses}) {
          $1 =~ m/^"?([^"]*)"?$/;
          my $dfn = rel2abs($1, $dir);
          $dfn .= ".ML" if -r "$dfn.ML";
          if (-r $dfn) {
            push @{$deps{$afn}}, abs2rel($dfn, $o{basedir});
            $leaves{$dfn} = 1;
          }
          else {
            diagnose(1, qq{$ofn: dependent file "}.abs2rel($dfn).
                        qq{" not readable.\n});
            $missing{$dfn} = 1;
          }
        }
      }
    }

    unless (m/^begin\b/) {
      diagnose (1, qq{$ofn: parse error\nDEBUG: "}.substr($_, 0, 70).qq{"\n});
    }

    # Find: ML_file "filename.ML"
    if ($o{uses}) {
      while ($body =~ s/ML_file\s+($re_string)//) {
        $1 =~ m/^"?([^"]*)"?$/;
        my $dfn = rel2abs($1, $dir);
        $dfn .= ".ML" if -r "$dfn.ML";
        if (-r $dfn) {
          push @{$deps{$afn}}, abs2rel($dfn, $o{basedir});
          $leaves{$dfn} = 1;
        }
        else {
          diagnose(1, qq{$ofn: dependent file "}.abs2rel($dfn).
            qq{" not readable.\n});
          $missing{$dfn} = 1;
        }
      }
    }
  }
  else {
    diagnose (1, "$ofn: parse error, file skipped.\n".
                 qq{DEBUG: "}.substr($content, 0, 70).qq{"\n});
  }
}

sub read_ml {
  my $re_string  = '"[^"]*"';
  my $re_comment = '\(\*(?:[^*]|\*(?!\)))*\*\)'; # (* parenthesized comment *)

  my $ofn = shift; # original filename as specified by the user (used in error messages)
  my $afn = shift; # absolute filename (hopefully unique) internal identifier
  my ($dir, $tn, $ty) = splitname($afn);

  open(FILE, "<", "$afn") || die "$afn: $!";
  my $content = join (" ", <FILE>);
  close(FILE) || die "$!";
  $content =~ s/$re_comment//gc;
  $content =~ s/\n/ /g;

  while ($content =~ s/\b(?:with_quick_and_dirty_)?use_thy\s*"([^"]*)"//) {
    thy_dep($1, $ofn, $afn);
  }
  while ($content =~ 
         s/\b(?:use_thys|with_skip_proofs_use_thys)\s*(\[\s*$re_string\s*(?:,\s*$re_string\s*)*\])//c) {
    $1 =~ m/^\[(.*)\]$/;
    $_ = $1;
    while (s/"([^"]*)"\s*(?:,\s*)?//) {
      thy_dep($1, $ofn, $afn);
    }
  }
}

while (@thyfiles > 0) {
  my $ofn = shift @thyfiles; my $afn = rel2abs($ofn);
  diagnose(2, "DEBUG: $ofn is imported twice along the graph.\n") and
  next if $fname{$afn};
  my ($dir, $tn, $ty) = splitname($afn);
  $fname{$afn} = abs2rel($afn, $o{basedir});
  if ($ty eq 'thy') { read_thy ($ofn, $afn); }
  elsif ($ty eq 'ML') { read_ml ($ofn, $afn); }
  else { diagnose (1, "$ofn: file type $ty not yet supported.\n"); }
}

### OUTPUT

sub write_make {
  my $fn = shift;
  open(FILE,">$fn") || die "$fn: $!";
  print FILE "# NOTE: This file has been generated by the following command:\n";
  print FILE "# $cmdline\n";
  print FILE join(' ', abs2rel($o{target}, $o{basedir}).':',
                       map {shell_quote($_)}
                           (sort (keys %deps, keys %leaves, keys %missing)))
             ."\n";
  print FILE join(' ', abs2rel($fn, $o{basedir}).':',
                       map {shell_quote($_)} (sort (keys %deps)))."\n"
    unless $fn eq '-';
  close FILE;
}

sub write_dot {
  my $fn = shift;
  
  ## clustering according to directories
  my %clusters;
  my $deps;
  foreach my $e (keys %deps) {
    my ($dir,$label,$ext) = splitname($e);
    $label .= ".$ext" unless $ext eq 'thy';
    my $rfn = abs2rel($e,$o{basedir});
    $deps .= qq!\n  "$rfn" -> {!;
    foreach my $dep (@{$deps{$e}}) { $deps .= qq{ "$dep"}; }
    $deps .= " };";
    $clusters{$dir} .= qq{    "$rfn" [label="$label"];\n};
  }
  foreach my $e (keys %leaves) {
    my ($dir,$label,$ext) = splitname($e);
    $label .= ".$ext" unless $ext eq 'thy';
    my $rfn = abs2rel($e,$o{basedir});
    $clusters{$dir} .= qq{    "$rfn" [label="$label"];\n};
  }

  ## output
  open(FILE,">$fn") || die "$fn: $!";
  print FILE "// NOTE: This file has been generated by the following command:\n";
  print FILE "// $cmdline\n";
  print FILE "digraph {\n  graph[$o{'graph-attrs'}];\n"
                       ."  edge[$o{'edge-attrs'}];\n";
  foreach my $e (keys %clusters) {
    my $rfn = abs2rel($e,$o{basedir}) || ".";
    print FILE qq[\n  subgraph "cluster_$rfn" {].
    qq[ $o{'cluster-attrs'} label="\\"$rfn\\"";\n$clusters{$e}  };\n];
  }
  print FILE "$deps\n}\n";
  close FILE;
}

sub write_text {
  my $fn = shift;
  open(FILE,">$fn") || die "$fn: $!";
  foreach my $x (sort (keys %deps, keys %leaves)) {
      print FILE (abs2rel($x, $o{basedir}) . "\n")
          unless $x eq '-';
  }
  close FILE;
}

if     ($o{format} eq 'make') { write_make($o{output}); }
elsif  ($o{format} eq 'dot')  { write_dot($o{output}); }
elsif  ($o{format} eq 'text')  { write_text($o{output}); }
else { die qq{unsupported output format "$o{format}"} };


=head1 NAME

thydeps - print Isabelle-theory dependencies.

=head1 SYNOPSIS

B<thydeps> [B<-h>|B<--help>]

B<thydeps> [B<-v>|B<--verbose>|B<-q>|B<--quiet>]
        [B<-D> I<dir>|B<--basedir> I<dir>]
        [B<--uses>|B<--no-uses>]
        [B<-r>|B<--recursive>|B<-R>|B<--no-recursive>]
        [B<-H> I<dir>|B<--isa-home> I<dir>]
        [B<-P> I<path>|B<--thypath> I<path>]
        [B<-i> I<name>|B<--ignore> I<name>] ...
        [B<-p> I<name>|B<--prune> I<name>] ...
        [B<-T> I<lang>|B<--format> I<lang>]
        [B<-G> I<attrs>|B<--graph-attrs> I<attrs>]
        [B<-E> I<attrs>|B<--edge-attrs> I<attrs>]
        [B<-C> I<attrs>|B<--cluster-attrs> I<attrs>]
        [B<-t> I<name>|B<--target> I<name>]
        [B<-o> I<outfile>|B<--output> I<outfile>]
        [I<files>]

=head1 DESCRIPTION

Reads the given theory or ROOT.ML I<files> and prints out a dependency graph.
Possible output format languages are make and dot.

=head1 OPTIONS

=over 4

=item B<-h>, B<--help>

prints a summary of the options and exits.

=item B<-v>, B<--verbose>

print debug information.

=item B<-q>, B<--quiet>

suppress warnings.

=item B<-D> I<dir>, B<--basedir> I<dir>

base directory.
In the output, all files names are specified relative to this directory.
If basedir is '//', file names are printed out absolute.

=item B<--uses> | B<--no-uses>

(dis)regard ML file dependencies specified with uses clause
Default is B<--uses>.

=item B<-r>, B<--recursive> | B<-R>, B<--no-recursive>

control recursive processing of theory files.
Default is B<--recursive>.

=item B<-H> I<dir>, B<--isa-home> I<dir>

set C<ISABELLE_HOME> directory in order to resolve C<~~> in theory files.
Defaults to the environment variable C<ISABELLE_HOME>.

=item B<-P> I<path>, B<--thypath> I<path>

set search path for theory files.
Multiple directories are separated with a colon (C<:>).

=item B<-i> I<name>, B<--ignore> I<name>

theories called I<name> are disregarded in dependencies.
Multiple theory names are specified by giving this option multiple times.

=item B<-p> I<name>, B<--prune> I<name>

theories called I<name> are not parsed for dependencies.
Multiple theory names are specified by giving this option multiple times.

=item B<-T> I<lang>, B<--format> I<lang>

specified the output format language of the dependencies.
Supported languages are 'make', 'dot' and 'text'.
Defaults to make.

=item B<-t> I<name>, B<--target> I<name>

Makefile target -- usually, the name of the heap file.

=item B<-o> I<outfile>, B<--output> I<outfile>

specifies the output file.
If this option is not specified, the file name defaults to thydeps with an
appropriate extension according the output format.
For standard output, specify C<->.

=back

=head1 SEE ALSO

L<dot(1)>, script source.
